import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
%matplotlib notebook
# Load the monthly unemployment table (tab-separated) and the ISO
# country-code lookup table.
data = pd.read_csv("data/une_rt_m.tsv", sep="\t")
codes = pd.read_csv("data/wikipedia-iso-country-codes.csv")
data.head()

# The first column packs five comma-separated dimensions into a single key;
# every other column header is a month written as <year>M<month>.
key_col = "s_adj,age,unit,sex,geo\\time"
col = data[key_col]
data.drop(key_col, axis=1, inplace=True)
# Strip the labels before parsing so the conversion no longer depends on each
# header carrying exactly one trailing blank.
data.columns = pd.to_datetime(data.columns.str.strip(), format="%YM%m").date
# Put the packed key back as the first column, next to the date-labelled ones.
data.insert(0, key_col, col)
data.head()

codes.head()
# Short, code-friendly names; the lookup file must have exactly five columns
# for this assignment to succeed.
codes.columns = ['country', 'C2', 'C3', 'numeric', 'iso']
codes.columns
# Break the packed key into one column per dimension. The key is split once
# and each new column picks its token by position.
packed_key = 's_adj,age,unit,sex,geo\\time'
key_tokens = data[packed_key].apply(lambda x: x.split(","))
for new_col, position in (("C2", -1), ("age", 1), ("unit", 2),
                          ("sex", 3), ("s_adj", 0)):
    data[new_col] = key_tokens.apply(
        lambda tokens, position=position: tokens[position]
    )
data.head()

# Inspect the distinct codes found in every dimension.
data['C2'].unique()
data['age'].unique()
data['unit'].unique()
# PC_ACT = Percentage of active population
# THS_PER = Thousand persons
data['sex'].unique()
data['s_adj'].unique()  # Seasonal adjustment
# NSA = Unadjusted data
# SA = Seasonally adjusted data
# TC = Trend cycle data

# The packed key is redundant once its parts live in their own columns.
data.drop(columns=packed_key, inplace=True)
# Dimension columns; everything else in `data` is a date-labelled month.
cols = ['C2', 'age', 'unit', 'sex', 's_adj']
data_t = data.drop(cols, axis=1)
data_t.head()

# Keep only the months after 2010-06. The labels are converted to timestamps
# for the comparison because recent pandas versions refuse to order
# `datetime.date` objects against a `Timestamp`.
month_stamps = pd.to_datetime(data_t.columns)
columns = data_t.columns[month_stamps > pd.Timestamp('2010-06')]
cols

# Work on an explicit copy so the inserts below never touch `data`.
data_sel = data[columns].copy()
for col in cols:
    # Inserting at position 0 puts the dimensions in front of the months
    # (in reverse order of `cols`).
    data_sel.insert(0, col, data[col])
data_sel.head()

# NOTE(review): the table looks like Eurostat's une_rt_m, whose geo codes use
# EL for Greece and UK for the United Kingdom, while the ISO alpha-2 list uses
# GR and GB. Without this mapping the inner merge below silently drops both
# countries; the mapping is a no-op when those codes are absent. Confirm
# against the data.
data_sel["C2"] = data_sel["C2"].replace({"EL": "GR", "UK": "GB"})

# Inner merge: rows whose code is not in the lookup table (for instance
# aggregate regions) are discarded.
data_sel = data_sel.merge(codes, on="C2")
data_sel.head()
data_sel.shape

# Wide -> long: one row per (dimension combination, month).
data_tr = data_sel.melt(
    id_vars=["country", "age", "unit", "sex", "s_adj",
             "C2", "C3", "numeric", "iso"],
    var_name="Date",
    value_name="Value",
)
data_tr.head()
data_tr.shape
import re

# Strip letters, colons and blanks from every cell so that only the numeric
# text remains; cells that held no number at all end up empty.
_NON_NUMERIC_CHARS = re.compile(r"[a-zA-Z: ]")

# `astype(str)` first: columns that pandas already parsed as numbers (or that
# contain NaN) would otherwise make the regex substitution fail. NaN becomes
# "nan", which the pattern reduces to "" and the filter below removes.
cleaned_values = (
    data_tr["Value"].astype(str).str.replace(_NON_NUMERIC_CHARS, "", regex=True)
)

# Drop the empty cells and take an explicit copy, so the float conversion
# writes to an independent frame instead of a slice of the melted table.
# The original index is kept on purpose.
data_tr = data_tr.assign(Value=cleaned_values).loc[cleaned_values != ""].copy()
data_tr["Value"] = data_tr["Value"].astype(float)
len(data_tr['Value'].unique())
# Trend-cycle rows only, split by unit: percentage of the active population
# versus thousands of persons.
is_trend_cycle = data_tr["s_adj"] == "TC"
data_per = data_tr[(data_tr["unit"] == 'PC_ACT') & is_trend_cycle]
data_th = data_tr[(data_tr["unit"] == 'THS_PER') & is_trend_cycle]

# Mean value per age band and sex, as flat frames ready for plotting.
tot_per = data_per.groupby(["age", "sex"])["Value"].mean().reset_index()
tot_th = data_th.groupby(["age", "sex"])["Value"].mean().reset_index()
# Grouped bars: mean unemployment rate per age band, one bar per sex.
# The raw age codes on the x axis are relabelled with readable names.
age_axis = dict(
    tickvals=['TOTAL', 'Y25-74', 'Y_LT25'],
    ticktext=['Total', 'Mayores de 25', 'Menores de 25'],
)
fig = px.bar(
    x=tot_per["age"],
    y=tot_per["Value"],
    color=tot_per["sex"],
    barmode='group',
)
fig.update_layout(
    title="Porcentaje de desempleo en la Unión Europea",
    xaxis_title="Grupo de edad",
    yaxis_title="Porcentaje de desempleo",
    legend_title="Sexo",
    xaxis=age_axis,
)
fig.show()
# Grouped bars: mean number of unemployed (thousands of persons) per age
# band, one bar per sex. The raw age codes are relabelled on the x axis.
age_axis = dict(
    tickvals=['TOTAL', 'Y25-74', 'Y_LT25'],
    ticktext=['Total', 'Mayores de 25', 'Menores de 25'],
)
fig = px.bar(
    x=tot_th["age"],
    y=tot_th["Value"],
    color=tot_th["sex"],
    barmode='group',
)
fig.update_layout(
    title="Desempleo total en la Unión Europea",
    xaxis_title="Grupo de edad",
    yaxis_title="Miles de personas",
    legend_title="Sexo",
    xaxis=age_axis,
)
fig.show()
# Ten countries with the highest mean unemployment rate over the selected
# period (all age bands and sexes of `data_per` averaged together).
most_un = data_per.groupby("country")["Value"].mean().nlargest(10)
df_most = most_un.reset_index()
df_most

# Horizontal bars, one colour per country, annotated with the rounded rate.
fig = px.bar(y=df_most["country"], x=df_most["Value"],
             color=df_most["country"], text=df_most["Value"].round(2))
# The accented "í" in "Países"/"País" had been corrupted to "Ã" in these labels.
fig.update_layout(title="Países de la Unión Europea con un mayor desempleo",
                  xaxis_title="Porcentaje de desempleo",
                  yaxis_title="País")
fig.show()
# Ten countries with the lowest mean unemployment rate; the slice reverses
# the ascending result so the largest of the ten comes first.
less_un = data_per.groupby("country")["Value"].mean().nsmallest(10)[::-1]
df_less = less_un.reset_index()
df_less

# Horizontal bars, one colour per country, annotated with the rounded rate.
fig = px.bar(y=df_less["country"], x=df_less["Value"],
             color=df_less["country"], text=df_less["Value"].round(2))
# The accented "í" in "Países"/"País" had been corrupted to "Ã" in these labels.
fig.update_layout(title="Países de la Unión Europea con un menor desempleo",
                  xaxis_title="Porcentaje de desempleo",
                  yaxis_title="País")
fig.show()
# Trend-cycle unemployment rate (PC_ACT) for the Y25-74 age band and sex
# code 'T' (presumably "total" -- confirm against the dataset's code list).
is_total_y25_74_rate = (
    (data_tr.age == 'Y25-74')
    & (data_tr.unit == 'PC_ACT')
    & (data_tr.sex == 'T')
    & (data_tr.s_adj == 'TC')
)
total_y25_74 = data_tr.loc[is_total_y25_74_rate]
def plot_time_variation(df, y='Value', size=1, is_log=False, title=""):
    """Draw one time-series line per country and tag each line with its name.

    Parameters
    ----------
    df : DataFrame
        Must provide the columns ``country``, ``Date`` and the column named
        by ``y``.
    y : str
        Name of the column plotted on the vertical axis.
    size : int or float
        Scale factor; the figure measures ``4*size`` by ``3*size`` inches.
    is_log : bool
        When true the vertical axis uses a logarithmic scale.
    title : str
        Fragment interpolated into the figure title.
    """
    # NOTE(review): `sns` (seaborn) is not imported in the visible part of
    # this file -- confirm it is in scope before calling this function.
    f, ax = plt.subplots(1, 1, figsize=(4 * size, 3 * size))
    for country in df.country.unique():
        df_ = df[df['country'] == country]
        sns.lineplot(x="Date", y=y, data=df_, label=country, ax=ax)
        # Tag the line at its most recent observation. The y coordinate is
        # reduced to a scalar: handing matplotlib a whole Series only works
        # by accident when it happens to hold a single element.
        last_date = df_['Date'].max()
        last_value = df_.loc[df_['Date'] == last_date, y].iloc[0]
        ax.text(last_date, last_value, str(country))
    # The label used to be drawn a second time here, after the loop, reusing
    # the loop variables: it duplicated the last country's tag and raised
    # NameError for an empty frame, so it has been removed.
    plt.xticks(rotation=90)
    plt.title(f'Desempleo total, {title}, agrupado por país')
    # Legend outside the axes, anchored to the top-right corner.
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    if is_log:
        ax.set(yscale="log")
    ax.grid(color='black', linestyle='dotted', linewidth=0.75)
    ax.set_xlabel("Fecha")
    ax.set_ylabel("Valor")
    plt.show()
def _tc_rate_subset(age, sex):
    """Return the trend-cycle PC_ACT rows of ``data_tr`` for one age band and sex code."""
    return data_tr.loc[(data_tr.age == age)
                       & (data_tr.unit == 'PC_ACT')
                       & (data_tr.sex == sex)
                       & (data_tr.s_adj == 'TC')]


# The titles used to read "24-75" although every frame below is filtered on
# the Y25-74 age band; they now state the range that is actually plotted.
plot_time_variation(total_y25_74, size=4,
                    is_log=True, title="edad 25-74 -")

# Women, 25-74.
total_F_y25_74 = _tc_rate_subset('Y25-74', 'F')
plot_time_variation(total_F_y25_74, size=4, is_log=True,
                    title="mujeres, edad 25-74 ")

# Men, 25-74.
total_M_y25_74 = _tc_rate_subset('Y25-74', 'M')
plot_time_variation(total_M_y25_74, size=4, is_log=True,
                    title="hombres, edad 25-74 ")

# Men, under 25.
total_M_y25 = _tc_rate_subset('Y_LT25', 'M')
plot_time_variation(total_M_y25, size=4, is_log=True,
                    title="hombres, menores de 25 años ")

# Women, under 25.
total_F_y25 = _tc_rate_subset('Y_LT25', 'F')
plot_time_variation(total_F_y25, size=4, is_log=True,
                    title="mujeres, menores de 25 años")
# From here on only the trend-cycle series are kept in the long table.
data_tr = data_tr[data_tr["s_adj"] == 'TC']
data_tr.head()

# Yearly mean rate per country, age band and sex. The year key is derived
# from `data_per` itself, so the grouping no longer depends on index
# alignment with a different frame.
year_of_obs = data_per["Date"].map(lambda d: d.year)
# `numeric_only=True`: the frame also holds string columns (unit, codes, ...),
# and recent pandas versions raise instead of silently skipping them.
data_yr = data_per.groupby(
    [year_of_obs, "country", "age", "sex"]
).mean(numeric_only=True)
# The averaged ISO numeric code is meaningless; keep only the mean value.
dt_yr = data_yr.drop("numeric", axis=1).reset_index()
dt_yr
def plot_time_variation_age_sex(data_tr_df, y='Value', country="Netherlands"):
    """Plot, for one country, the series of the four age-band/sex groups.

    Only rows of ``data_tr_df`` with unit ``PC_ACT`` and adjustment ``TC``
    are used. Four lines are drawn against ``Date``: women and men under 25
    (``Y_LT25``) and women and men in the ``Y25-74`` band.

    Parameters
    ----------
    data_tr_df : DataFrame
        Must provide the columns ``country``, ``unit``, ``s_adj``, ``age``,
        ``sex``, ``Date`` and the column named by ``y``.
    y : str
        Name of the column plotted on the vertical axis.
    country : str
        Value of the ``country`` column to select.
    """
    # NOTE(review): `sns` (seaborn) is not imported in the visible part of
    # this file -- confirm it is in scope before calling this function.
    is_country_rate = (
        (data_tr_df.country == country)
        & (data_tr_df.unit == 'PC_ACT')
        & (data_tr_df.s_adj == 'TC')
    )
    c_df = data_tr_df.loc[is_country_rate]

    f, ax = plt.subplots(1, 1, figsize=(16, 12))

    # (age code, sex code, legend label), drawn in this order.
    line_specs = (
        ('Y_LT25', 'F', "Mujeres, menores de 25 años"),
        ('Y_LT25', 'M', "Hombres, menores de 25 años"),
        ('Y25-74', 'F', "Mujeres, mayores de 25 años"),
        ('Y25-74', 'M', "Hombres, mayores de 25 años"),
    )
    for age_code, sex_code, legend_label in line_specs:
        group_rows = c_df.loc[(c_df.age == age_code) & (c_df.sex == sex_code)]
        sns.lineplot(x="Date", y=y, data=group_rows, label=legend_label)

    plt.xticks(rotation=90)
    plt.title(f'Desempleo total en {country}, agrupados por edad y sexo')
    # Legend outside the axes, anchored to the top-right corner.
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    ax.grid(color='black', linestyle='dotted', linewidth=0.75)
    ax.set_xlabel("Fecha")
    ax.set_ylabel("Porcentaje")
    plt.show()
# Average the yearly country figures across countries, leaving one value per
# year, sex and age band.
s_yr = dt_yr.groupby(["Date", "sex", "age"])["Value"].mean()
df_yr = s_yr.reset_index()

# One line per age band (colour) and sex (dash style).
plt.figure(figsize=(12, 9))
ax = sns.lineplot(data=df_yr, x="Date", y="Value", hue="age", style="sex")
ax.set_title("Evolución del desempleo en la Unión Europea")
ax.set_xlabel("Fecha")
# Trailing semicolon keeps the notebook from echoing the returned object.
ax.set_ylabel("Porcentaje");
# Recent window: observations dated after 2019-02. The Date column is
# converted to timestamps for the comparison because recent pandas versions
# refuse to order `datetime.date` objects against a `Timestamp`.
data_1920 = data_tr[pd.to_datetime(data_tr["Date"]) > pd.to_datetime("2019-02")]

# For every country of interest: the whole selected period first, then the
# recent window only.
for country_name in ("Spain", "Croatia", "Portugal",
                     "Switzerland", "Norway", "Germany"):
    plot_time_variation_age_sex(data_tr, country=country_name)
    plot_time_variation_age_sex(data_1920, country=country_name)